/*-------------------------------------------------------------------------
 *
 * ts_parse.c
 *        main parse functions for tsearch
 *
 * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
 *
 *
 * IDENTIFICATION
 *      src/backend/tsearch/ts_parse.c
 *
 *-------------------------------------------------------------------------
 */

#include "postgres.h"

#include "tsearch/ts_cache.h"
#include "tsearch/ts_utils.h"

/*
 * Policy switch for overlong words.  While this macro is defined, a token
 * whose length is MAXSTRLEN or more is reported with a NOTICE and skipped by
 * the parse loops below; if it is left undefined, the same condition raises
 * an ERROR instead.
 */
#define IGNORE_LONGLEXEME    1

/*
 * Lexize subsystem
 */

/*
 * One token handed over by the parser, kept as a node of a singly linked
 * list (see ListParsedLex).
 */
typedef struct ParsedLex
{
    /* token type as returned by the parser's token function */
    int            type;
    /* token text; the pointer is stored as given, the text is not copied */
    char       *lemm;
    /* length of lemm */
    int            lenlemm;
    /* following node in the list, or NULL for the last one */
    struct ParsedLex *next;
} ParsedLex;

/*
 * Singly linked list of ParsedLex nodes with O(1) append and O(1) removal
 * from the front.  Both pointers are NULL when the list is empty.
 */
typedef struct ListParsedLex
{
    /* first node, or NULL */
    ParsedLex  *head;
    /* last node, or NULL */
    ParsedLex  *tail;
} ListParsedLex;

/*
 * Working state of the lexize machinery, carried across LexizeExec calls.
 */
typedef struct
{
    /* text search configuration whose map selects dictionaries per token type */
    TSConfigCacheEntry *cfg;
    /*
     * InvalidOid in ordinary (one word at a time) mode; otherwise the
     * dictionary that asked to see following words (multiword mode)
     */
    Oid            curDictId;
    /* index in the token type's dictionary list at which to (re)start */
    int            posDict;
    /* state block passed to every dictionary lexize call */
    DictSubState dictState;
    /* next lexeme to feed to the dictionary in multiword mode */
    ParsedLex  *curSub;
    ListParsedLex towork;        /* current list to work */
    ListParsedLex waste;        /* list of lexemes that already lexized */

    /*
     * Fields to store the last variant to lexize (basically for a thesaurus
     * or a similar dictionary, which wants several lexemes): tmpRes is the
     * most recent non-NULL result returned while the dictionary was still
     * asking for more input, and lastRes is the lexeme at which that result
     * was produced.
     */

    ParsedLex  *lastRes;
    TSLexeme   *tmpRes;
} LexizeData;

/*
 * Put a LexizeData into its starting state for configuration cfg: ordinary
 * (single-word) mode, both lexeme lists empty, and no pending multiword
 * result.  dictState is not touched here.
 */
static void
LexizeInit(LexizeData *ld, TSConfigCacheEntry *cfg)
{
    /* nothing queued and nothing consumed yet */
    ld->curSub = NULL;
    ld->towork.tail = NULL;
    ld->towork.head = NULL;
    ld->waste.tail = NULL;
    ld->waste.head = NULL;

    /* no remembered partial result from a multiword dictionary */
    ld->tmpRes = NULL;
    ld->lastRes = NULL;

    /* start in ordinary mode at the first dictionary */
    ld->posDict = 0;
    ld->curDictId = InvalidOid;
    ld->cfg = cfg;
}

/*
 * Append newpl to the end of list.  The appended node's next pointer is
 * always reset to NULL, so a node taken from another list can be passed in
 * directly.
 */
static void
LPLAddTail(ListParsedLex *list, ParsedLex *newpl)
{
    if (list->tail != NULL)
        list->tail->next = newpl;    /* hook behind the current last node */
    else
        list->head = newpl;            /* list was empty */

    list->tail = newpl;
    newpl->next = NULL;
}

/*
 * Detach the first node of list and return it; returns NULL if the list is
 * empty.  The returned node's next pointer is left as it was.
 */
static ParsedLex *
LPLRemoveHead(ListParsedLex *list)
{
    ParsedLex  *first = list->head;

    if (first != NULL)
        list->head = first->next;

    /* removing the only node (or an already empty list) leaves no tail */
    if (list->head == NULL)
        list->tail = NULL;

    return first;
}

/*
 * Queue one parser token for lexizing.  A new ParsedLex node is allocated
 * and appended to ld->towork; lemm is stored by pointer, not copied.  The
 * multiword cursor (curSub) is pointed at the new tail of the work list.
 */
static void
LexizeAddLemm(LexizeData *ld, int type, char *lemm, int lenlemm)
{
    ParsedLex  *node;

    node = (ParsedLex *) palloc(sizeof(ParsedLex));
    node->lenlemm = lenlemm;
    node->lemm = lemm;
    node->type = type;

    LPLAddTail(&ld->towork, node);
    ld->curSub = ld->towork.tail;
}

/*
 * Move the first lexeme of the work list to the end of the waste list and
 * restart dictionary selection at position 0 for whatever comes next.
 */
static void
RemoveHead(LexizeData *ld)
{
    ParsedLex  *done = LPLRemoveHead(&ld->towork);

    LPLAddTail(&ld->waste, done);
    ld->posDict = 0;
}

/*
 * Dispose of the waste list.  If correspondLexem is non-NULL the whole list
 * is handed to the caller through it (the caller then owns the nodes);
 * otherwise every node is freed here.  Either way the waste list is empty
 * afterwards.
 */
static void
setCorrLex(LexizeData *ld, ParsedLex **correspondLexem)
{
    if (correspondLexem != NULL)
        *correspondLexem = ld->waste.head;
    else
    {
        ParsedLex  *node = ld->waste.head;

        while (node != NULL)
        {
            ParsedLex  *following = node->next;

            pfree(node);
            node = following;
        }
    }

    ld->waste.tail = NULL;
    ld->waste.head = NULL;
}

/*
 * Move lexemes from the front of the work list to the waste list, up to and
 * including stop.  When stop is reached, curSub is set to the lexeme that
 * followed it.  If stop is not in the work list, the whole list is moved.
 */
static void
moveToWaste(LexizeData *ld, ParsedLex *stop)
{
    while (ld->towork.head != NULL)
    {
        bool        reached = (ld->towork.head == stop);

        /* read stop->next before RemoveHead relinks the node */
        if (reached)
            ld->curSub = stop->next;

        RemoveHead(ld);

        if (reached)
            break;
    }
}

/*
 * Remember res as the latest partial result of a multiword dictionary,
 * produced at lexeme lex.  Any previously remembered result is freed first:
 * each lexeme string up to the terminating entry with a NULL lexeme, then
 * the array itself.
 */
static void
setNewTmpRes(LexizeData *ld, ParsedLex *lex, TSLexeme *res)
{
    TSLexeme   *old = ld->tmpRes;

    if (old != NULL)
    {
        TSLexeme   *cur = old;

        while (cur->lexeme)
        {
            pfree(cur->lexeme);
            cur++;
        }
        pfree(old);
    }

    ld->lastRes = lex;
    ld->tmpRes = res;
}

/*
 * Run dictionaries over the lexemes queued in ld->towork and return the next
 * array of normalized lexemes, or NULL if the queued input yields nothing
 * (more) for now.
 *
 * The function works in one of two modes, selected by ld->curDictId:
 *
 * - InvalidOid (ordinary mode): the head of the work list is offered to the
 *   dictionaries mapped to its token type, starting at ld->posDict, until
 *   one of them returns a result.  Lexemes that are skipped or that no
 *   dictionary recognizes are moved to the waste list.
 *
 * - valid OID (multiword mode): that dictionary previously set getnext, so
 *   the lexemes starting at ld->curSub are fed to it one by one until it
 *   stops asking for more.
 *
 * Every return path that does not recurse calls setCorrLex first: if
 * correspondLexem is non-NULL it receives the list of lexemes consumed by
 * this call (the caller then owns those nodes), otherwise the consumed
 * lexemes are freed.
 */
static TSLexeme *
LexizeExec(LexizeData *ld, ParsedLex **correspondLexem)
{// #lizard forgives
    int            i;
    ListDictionary *map;
    TSDictionaryCacheEntry *dict;
    TSLexeme   *res;

    if (ld->curDictId == InvalidOid)
    {
        /*
         * usual mode: dictionary wants only one word, but we should keep in
         * mind that we should go through all stack
         */

        while (ld->towork.head)
        {
            ParsedLex  *curVal = ld->towork.head;
            char       *curValLemm = curVal->lemm;
            int            curValLenLemm = curVal->lenlemm;

            /* map is only dereferenced after the range check on type below */
            map = ld->cfg->map + curVal->type;

            if (curVal->type == 0 || curVal->type >= ld->cfg->lenmap || map->len == 0)
            {
                /* skip this type of lexeme */
                RemoveHead(ld);
                continue;
            }

            /*
             * posDict is nonzero only when we come back here after a
             * multiword dictionary gave up; it is reset by RemoveHead.
             */
            for (i = ld->posDict; i < map->len; i++)
            {
                dict = lookup_ts_dictionary_cache(map->dictIds[i]);

                ld->dictState.isend = ld->dictState.getnext = false;
                ld->dictState.private_state = NULL;
                res = (TSLexeme *) DatumGetPointer(FunctionCall4(
                                                                 &(dict->lexize),
                                                                 PointerGetDatum(dict->dictData),
                                                                 PointerGetDatum(curValLemm),
                                                                 Int32GetDatum(curValLenLemm),
                                                                 PointerGetDatum(&ld->dictState)
                                                                 ));

                if (ld->dictState.getnext)
                {
                    /*
                     * dictionary wants next word, so setup and store current
                     * position and go to multiword mode
                     */

                    ld->curDictId = DatumGetObjectId(map->dictIds[i]);
                    ld->posDict = i + 1;
                    ld->curSub = curVal->next;
                    if (res)
                        setNewTmpRes(ld, curVal, res);
                    return LexizeExec(ld, correspondLexem);
                }

                if (!res)        /* dictionary doesn't know this lexeme */
                    continue;

                if (res->flags & TSL_FILTER)
                {
                    /*
                     * filtering result: pass its text, instead of the
                     * original lexeme, to the remaining dictionaries
                     */
                    curValLemm = res->lexeme;
                    curValLenLemm = strlen(res->lexeme);
                    continue;
                }

                /* a dictionary recognized the lexeme: consume it and return */
                RemoveHead(ld);
                setCorrLex(ld, correspondLexem);
                return res;
            }

            /* no dictionary produced a final result for this lexeme */
            RemoveHead(ld);
        }
    }
    else
    {                            /* curDictId is valid */
        dict = lookup_ts_dictionary_cache(ld->curDictId);

        /*
         * Dictionary ld->curDictId asks us about following words
         */

        while (ld->curSub)
        {
            ParsedLex  *curVal = ld->curSub;

            map = ld->cfg->map + curVal->type;

            if (curVal->type != 0)
            {
                bool        dictExists = false;

                if (curVal->type >= ld->cfg->lenmap || map->len == 0)
                {
                    /* skip this type of lexeme */
                    ld->curSub = curVal->next;
                    continue;
                }

                /*
                 * We should be sure that the current type of lexeme is
                 * recognized by our dictionary: we just check whether it
                 * exists in the list of dictionaries for this type.
                 */
                for (i = 0; i < map->len && !dictExists; i++)
                    if (ld->curDictId == DatumGetObjectId(map->dictIds[i]))
                        dictExists = true;

                if (!dictExists)
                {
                    /*
                     * Dictionary can't work with current type of lexeme,
                     * return to basic mode and redo all stored lexemes
                     */
                    ld->curDictId = InvalidOid;
                    return LexizeExec(ld, correspondLexem);
                }
            }

            /* a lexeme of type 0 is reported to the dictionary as the end */
            ld->dictState.isend = (curVal->type == 0) ? true : false;
            ld->dictState.getnext = false;

            res = (TSLexeme *) DatumGetPointer(FunctionCall4(
                                                             &(dict->lexize),
                                                             PointerGetDatum(dict->dictData),
                                                             PointerGetDatum(curVal->lemm),
                                                             Int32GetDatum(curVal->lenlemm),
                                                             PointerGetDatum(&ld->dictState)
                                                             ));

            if (ld->dictState.getnext)
            {
                /* Dictionary wants one more */
                ld->curSub = curVal->next;
                if (res)
                    setNewTmpRes(ld, curVal, res);
                continue;
            }

            if (res || ld->tmpRes)
            {
                /*
                 * Dictionary normalizes lexemes, so we remove from stack all
                 * used lexemes, return to basic mode and redo end of stack
                 * (if it exists)
                 */
                if (res)
                {
                    /* fresh result: everything up to curSub was consumed */
                    moveToWaste(ld, ld->curSub);
                }
                else
                {
                    /*
                     * fall back to the remembered partial result; only the
                     * lexemes up to the one that produced it are consumed
                     */
                    res = ld->tmpRes;
                    moveToWaste(ld, ld->lastRes);
                }

                /* reset to initial state */
                ld->curDictId = InvalidOid;
                ld->posDict = 0;
                ld->lastRes = NULL;
                ld->tmpRes = NULL;
                setCorrLex(ld, correspondLexem);
                return res;
            }

            /*
             * Dictionary doesn't want the next lexeme and didn't recognize
             * anything; redo from ld->towork.head (posDict still points past
             * this dictionary)
             */
            ld->curDictId = InvalidOid;
            return LexizeExec(ld, correspondLexem);
        }
    }

    /*
     * Nothing to return for now.  If we got here in multiword mode,
     * curDictId is still set, so the next call keeps feeding the same
     * dictionary.
     */
    setCorrLex(ld, correspondLexem);
    return NULL;
}

/*
 * Parse string and lexize words.
 *
 * buf/buflen is tokenized with the parser of configuration cfgId; every
 * token is queued for lexizing, and each normalized lexeme that comes back
 * is appended to prs->words (the array is doubled when full).  prs->pos is
 * advanced once per result array and once more for every lexeme flagged
 * TSL_ADDPOS.  The lexeme strings are stored in prs by pointer; only the
 * result arrays themselves are freed here.
 *
 * prs will be filled in.
 */
void
parsetext(Oid cfgId, ParsedText *prs, char *buf, int buflen)
{// #lizard forgives
    TSConfigCacheEntry *cfg = lookup_ts_config_cache(cfgId);
    TSParserCacheEntry *prsobj = lookup_ts_parser_cache(cfg->prsId);
    void       *prsdata;
    LexizeData    ldata;
    char       *lemm = NULL;
    int            lenlemm;
    int            type;

    prsdata = (void *) DatumGetPointer(FunctionCall2(&prsobj->prsstart,
                                                     PointerGetDatum(buf),
                                                     Int32GetDatum(buflen)));

    LexizeInit(&ldata, cfg);

    do
    {
        TSLexeme   *norms;

        /* fetch the next token; the loop ends once type is not positive */
        type = DatumGetInt32(FunctionCall3(&(prsobj->prstoken),
                                           PointerGetDatum(prsdata),
                                           PointerGetDatum(&lemm),
                                           PointerGetDatum(&lenlemm)));

        if (type > 0 && lenlemm >= MAXSTRLEN)
        {
#ifdef IGNORE_LONGLEXEME
            ereport(NOTICE,
                    (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
                     errmsg("word is too long to be indexed"),
                     errdetail("Words longer than %d characters are ignored.",
                               MAXSTRLEN)));
            continue;
#else
            ereport(ERROR,
                    (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
                     errmsg("word is too long to be indexed"),
                     errdetail("Words longer than %d characters are ignored.",
                               MAXSTRLEN)));
#endif
        }

        LexizeAddLemm(&ldata, type, lemm, lenlemm);

        /* drain every result the queued lexemes can produce right now */
        for (;;)
        {
            TSLexeme   *cur;

            norms = LexizeExec(&ldata, NULL);
            if (norms == NULL)
                break;

            prs->pos++;            /* set pos */

            for (cur = norms; cur->lexeme; cur++)
            {
                ParsedWord *slot;

                /* grow the output array before taking a pointer into it */
                if (prs->curwords == prs->lenwords)
                {
                    prs->lenwords *= 2;
                    prs->words = (ParsedWord *) repalloc((void *) prs->words, prs->lenwords * sizeof(ParsedWord));
                }

                if (cur->flags & TSL_ADDPOS)
                    prs->pos++;

                slot = &prs->words[prs->curwords];
                slot->len = strlen(cur->lexeme);
                slot->word = cur->lexeme;
                slot->nvariant = cur->nvariant;
                slot->flags = cur->flags & TSL_PREFIX;
                slot->alen = 0;
                slot->pos.pos = LIMITPOS(prs->pos);

                prs->curwords++;
            }

            /* the lexeme strings now belong to prs; drop only the array */
            pfree(norms);
        }
    } while (type > 0);

    FunctionCall1(&(prsobj->prsend), PointerGetDatum(prsdata));
}

/*
 * Headline framework
 */
/*
 * Append one source token to the headline word array.  The array is doubled
 * until there is room, the new entry is zeroed, and the token text is copied
 * into freshly allocated memory of exactly buflen bytes (no terminator is
 * added).
 */
static void
hladdword(HeadlineParsedText *prs, char *buf, int buflen, int type)
{
    HeadlineWordEntry *entry;

    while (prs->curwords >= prs->lenwords)
    {
        prs->lenwords *= 2;
        prs->words = (HeadlineWordEntry *) repalloc((void *) prs->words, prs->lenwords * sizeof(HeadlineWordEntry));
    }

    /* take the pointer only after the array can no longer move */
    entry = &(prs->words[prs->curwords]);
    memset(entry, 0, sizeof(HeadlineWordEntry));
    entry->type = (uint8) type;
    entry->len = buflen;
    entry->word = palloc(buflen);
    memcpy(entry->word, buf, buflen);

    prs->curwords++;
}

/*
 * Mark the most recently added headline word with its position and with the
 * query operands that match the normalized lexeme buf/buflen.
 *
 * The first matching operand is recorded in the word itself.  For each
 * further match a copy of the word entry is appended, pointing at that
 * operand and flagged as repeated.  Room for query->size extra entries is
 * reserved up front so the word pointer stays valid during the scan.
 */
static void
hlfinditem(HeadlineParsedText *prs, TSQuery query, int32 pos, char *buf, int buflen)
{
    QueryItem  *items = GETQUERY(query);
    HeadlineWordEntry *word;
    int            idx;

    while (prs->curwords + query->size >= prs->lenwords)
    {
        prs->lenwords *= 2;
        prs->words = (HeadlineWordEntry *) repalloc((void *) prs->words, prs->lenwords * sizeof(HeadlineWordEntry));
    }

    word = &(prs->words[prs->curwords - 1]);
    word->pos = LIMITPOS(pos);

    for (idx = 0; idx < query->size; idx++)
    {
        QueryItem  *item = &items[idx];

        /* only value items are compared against the lexeme */
        if (item->type != QI_VAL)
            continue;

        if (tsCompareString(GETOPERAND(query) + item->qoperand.distance, item->qoperand.length,
                            buf, buflen, item->qoperand.prefix) != 0)
            continue;

        if (word->item)
        {
            /* word already has an operand: add a "repeated" duplicate */
            HeadlineWordEntry *dup = &(prs->words[prs->curwords]);

            memcpy(dup, word, sizeof(HeadlineWordEntry));
            dup->item = &item->qoperand;
            dup->repeated = 1;
            prs->curwords++;
        }
        else
            word->item = &item->qoperand;
    }
}

/*
 * Add the source lexemes in list lexs to the headline and match them against
 * the query using the normalized lexemes in norms (which may be NULL).
 *
 * Each source lexeme with a positive type becomes a headline word, and for
 * every source lexeme all entries of norms are looked up in the query.
 * Positions are counted from prs->vectorpos, with one extra step per
 * TSL_ADDPOS flag.  Both inputs are consumed: every node of lexs is freed,
 * as are the lexeme strings of norms and the norms array itself, and
 * prs->vectorpos is advanced once per TSL_ADDPOS flag in norms.
 */
static void
addHLParsedLex(HeadlineParsedText *prs, TSQuery query, ParsedLex *lexs, TSLexeme *norms)
{// #lizard forgives
    TSLexeme   *cur;

    while (lexs != NULL)
    {
        ParsedLex  *following;
        int32        pos;

        if (lexs->type > 0)
            hladdword(prs, lexs->lemm, lexs->lenlemm, lexs->type);

        /* every source lexeme restarts the position count */
        pos = prs->vectorpos;
        if (norms != NULL)
        {
            for (cur = norms; cur->lexeme; cur++)
            {
                if (cur->flags & TSL_ADDPOS)
                    pos++;
                hlfinditem(prs, query, pos, cur->lexeme, strlen(cur->lexeme));
            }
        }

        following = lexs->next;
        pfree(lexs);
        lexs = following;
    }

    if (norms == NULL)
        return;

    /* commit the position advance and release the normalized lexemes */
    for (cur = norms; cur->lexeme; cur++)
    {
        if (cur->flags & TSL_ADDPOS)
            prs->vectorpos++;
        pfree(cur->lexeme);
    }
    pfree(norms);
}

/*
 * Parse buf/buflen with the parser of configuration cfgId and fill the
 * headline structure prs: every token is queued for lexizing, and each batch
 * of consumed source lexemes, together with its normalized form (if any), is
 * passed to addHLParsedLex for matching against query.  prs->vectorpos is
 * advanced once for every non-NULL result.
 */
void
hlparsetext(Oid cfgId, HeadlineParsedText *prs, TSQuery query, char *buf, int buflen)
{
    TSConfigCacheEntry *cfg = lookup_ts_config_cache(cfgId);
    TSParserCacheEntry *prsobj = lookup_ts_parser_cache(cfg->prsId);
    void       *prsdata;
    LexizeData    ldata;
    char       *lemm = NULL;
    int            lenlemm;
    int            type;

    prsdata = (void *) DatumGetPointer(FunctionCall2(&(prsobj->prsstart),
                                                     PointerGetDatum(buf),
                                                     Int32GetDatum(buflen)));

    LexizeInit(&ldata, cfg);

    do
    {
        bool        produced;

        /* fetch the next token; the loop ends once type is not positive */
        type = DatumGetInt32(FunctionCall3(&(prsobj->prstoken),
                                           PointerGetDatum(prsdata),
                                           PointerGetDatum(&lemm),
                                           PointerGetDatum(&lenlemm)));

        if (type > 0 && lenlemm >= MAXSTRLEN)
        {
#ifdef IGNORE_LONGLEXEME
            ereport(NOTICE,
                    (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
                     errmsg("word is too long to be indexed"),
                     errdetail("Words longer than %d characters are ignored.",
                               MAXSTRLEN)));
            continue;
#else
            ereport(ERROR,
                    (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
                     errmsg("word is too long to be indexed"),
                     errdetail("Words longer than %d characters are ignored.",
                               MAXSTRLEN)));
#endif
        }

        LexizeAddLemm(&ldata, type, lemm, lenlemm);

        /*
         * Keep asking for results until none is left.  The consumed source
         * lexemes are passed on even for the final NULL result.
         */
        do
        {
            ParsedLex  *lexs;
            TSLexeme   *norms;

            norms = LexizeExec(&ldata, &lexs);
            produced = (norms != NULL);

            if (produced)
                prs->vectorpos++;

            addHLParsedLex(prs, query, lexs, norms);
        } while (produced);

    } while (type > 0);

    FunctionCall1(&(prsobj->prsend), PointerGetDatum(prsdata));
}

/*
 * Build the headline text from the words in prs.
 *
 * Words flagged "in" (and not "repeated") are emitted: a replaced word
 * becomes a single space, a skipped word produces nothing, and any other
 * word is copied, wrapped in startsel/stopsel if it is selected.  A run of
 * emitted words forms a fragment; every fragment after the first is preceded
 * by fragdelim.  The text of words that are neither "in" nor "repeated" is
 * freed.  The result is a newly allocated text value.
 */
text *
generateHeadline(HeadlineParsedText *prs)
{// #lizard forgives
    int            len = 128;
    text       *out = (text *) palloc(len);
    char       *ptr = ((char *) out) + VARHDRSZ;
    int            numfragments = 0;
    bool        infrag = false;
    HeadlineWordEntry *wrd;

    for (wrd = prs->words; wrd - prs->words < prs->curwords; wrd++)
    {
        /* make sure the word plus all possible decorations will fit */
        while (wrd->len + prs->stopsellen + prs->startsellen + prs->fragdelimlen + (ptr - ((char *) out)) >= len)
        {
            int            dist = ptr - ((char *) out);

            len *= 2;
            out = (text *) repalloc(out, len);
            ptr = ((char *) out) + dist;
        }

        /* duplicates of an earlier entry contribute nothing */
        if (wrd->repeated)
            continue;

        if (!wrd->in)
        {
            /* word is outside the headline: it ends any open fragment */
            infrag = false;
            pfree(wrd->word);
            continue;
        }

        if (!infrag)
        {
            /* start of a new fragment */
            infrag = true;
            numfragments++;

            /* add a fragment delimiter if this is after the first one */
            if (numfragments > 1)
            {
                memcpy(ptr, prs->fragdelim, prs->fragdelimlen);
                ptr += prs->fragdelimlen;
            }
        }

        if (wrd->replace)
        {
            *ptr = ' ';
            ptr++;
            continue;
        }

        if (wrd->skip)
            continue;

        if (wrd->selected)
        {
            memcpy(ptr, prs->startsel, prs->startsellen);
            ptr += prs->startsellen;
        }

        memcpy(ptr, wrd->word, wrd->len);
        ptr += wrd->len;

        if (wrd->selected)
        {
            memcpy(ptr, prs->stopsel, prs->stopsellen);
            ptr += prs->stopsellen;
        }
    }

    SET_VARSIZE(out, ptr - ((char *) out));
    return out;
}
